Kapitel 18 Modals in Slovene Texts
18.1 Programi
library(tidyverse)
library(scales)
library(janitor)
library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(tidytext)
library(readxl)
library(writexl)18.2 Branje besedil
Čivki slovenskih politikov.
slovtwit1 <- read_rds("data/tmls_all_slo_politicians_2021-03-21.rds")
head(slovtwit1)## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 258856900 1373313507467403266 2021-03-20 16:40:29 JJansaSDS "Alojzij~ Twitt~
## 2 258856900 1373313373677424643 2021-03-20 16:39:57 JJansaSDS "Aleluja~ Twitt~
## 3 258856900 1373308627658928134 2021-03-20 16:21:06 JJansaSDS "Pozor, ~ Twitt~
## 4 258856900 1373308499942375424 2021-03-20 16:20:35 JJansaSDS "Ne poza~ Twitt~
## 5 258856900 1373308106206277633 2021-03-20 16:19:02 JJansaSDS "Mödendo~ Twitt~
## 6 258856900 1373307672087429124 2021-03-20 16:17:18 JJansaSDS "V četrt~ Twitt~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>, ...
slovtwit <- read_rds("data/tmls_all_slo_politicians_2021-09-18.rds")
head(slovtwit)## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 258856900 1439192528507621376 2021-09-18 11:40:12 JJansaSDS "V Zagor~ Twitt~
## 2 258856900 1439191956303777797 2021-09-18 11:37:55 JJansaSDS "Iskrena~ Twitt~
## 3 258856900 1439190521843421185 2021-09-18 11:32:13 JJansaSDS "Discuss~ Twitt~
## 4 258856900 1439190476091973635 2021-09-18 11:32:03 JJansaSDS "Že 1995~ Twitt~
## 5 258856900 1439190233589895170 2021-09-18 11:31:05 JJansaSDS "#Taliba~ Twitt~
## 6 258856900 1439190017390358528 2021-09-18 11:30:13 JJansaSDS "Tako se~ Twitt~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>, ...
Izbor časopisnih člankov, pretežno v rubriki zdravje oz. koronavirus.
# Directory holding the newspaper-article JSON files.
parent_folder <- "data/"
# Recursively collect every *.json path under the data directory.
# (T replaced with TRUE: T is an ordinary, reassignable variable.)
filenames <- list.files(parent_folder, recursive = TRUE,
                        full.names = TRUE, pattern = "\\.json$")
head(filenames, 3)## [1] "data//casniki/delo_korona2.json" "data//casniki/delo_mnenja2.json"
## [3] "data//casniki/delo_novice2.json"
# slovbes <- lapply(filenames,load,.GlobalEnv)
# Read all newspaper articles into a single readtext object; the article body
# lives in the JSON field "text", the remaining fields become docvars.
# NOTE(review): encoding = "latin1" — confirm the source files are really
# Latin-1; Slovene JSON exports are more commonly UTF-8.
slovcas <- readtext(filenames, text_field = "text", encoding = "latin1",
                    verbosity = 0, ignore_missing_files = TRUE)
slovcas## readtext object consisting of 2674 documents and 4 docvars.
## # Description: df [2,674 x 6]
## doc_id text date title intro url
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 delo_korona2.json.1 "\"Vladni gov\"..." 2020-08-19 Hrvaška ~ Vlada b~ https:/~
## 2 delo_korona2.json.2 "\"Ladja gršk\"..." 2020-08-19 Po zasid~ Kaj se ~ https:/~
## 3 delo_korona2.json.3 "\"V torek so\"..." 2020-08-19 V Sloven~ Aktivno~ https:/~
## 4 delo_korona2.json.4 "\"KLJUČNI PO\"..." 2020-08-19 Na Hrvaš~ V člank~ https:/~
## 5 delo_korona2.json.5 "\"Število ok\"..." 2020-08-18 Kacin: U~ Na Hrva~ https:/~
## 6 delo_korona2.json.6 "\"Nestrpnost\"..." 2020-08-19 Vročina ~ V izbor~ https:/~
## # ... with 2,668 more rows
18.3 Korpus
sltwit_corp <- corpus(slovtwit, text_field = "text")slcas_corp <- corpus(slovcas)library(DT)
slcas_stat <- summary(slcas_corp, n = 2674)
# DT::datatable(slcas_stat, fillContainer = TRUE, filter = "top",
# options = list(pageLength = 4))
DT::datatable(head(slcas_stat, 30),
filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("Text",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))Ponovno združi statistiko in text v eno podatkovno zbirko.
slcas <- slcas_stat %>%
rename(doc_id = Text) %>%
left_join(slovcas)slcas_summary <- textstat_summary(slcas_corp) %>%
rename(doc_id = document)slcas1 <- as_tibble(slcas) %>%
left_join(slcas_summary, by = "doc_id")
DT::datatable(head(slcas1, 30),
filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("doc_id",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))18.3.0.1 Zapiši na disk.
# write_csv2(slcas1, "data/slovenski_casniki.csv")18.4 Besedne oblike in matrika
# Tokenize the newspaper corpus, dropping numbers, punctuation, symbols and
# URLs so that the frequency table below counts only word forms.
# (T replaced with TRUE throughout: T is a reassignable variable.)
slcas_toks <- tokens(slcas_corp, remove_numbers = TRUE, remove_punct = TRUE,
                     remove_symbols = TRUE, remove_url = TRUE)
slcas_dfm <- dfm(slcas_toks)18.5 Pogostnost oblik
freqs <- textstat_frequency(slcas_dfm)
DT::datatable(head(freqs, 30),
filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("feature",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))18.6 Konkordance
Za sestavo konkordanc, ki vsebujejo zaželene naklonske zgradbe,
uporabljamo funkcijo kwic() programa quanteda. Konkordanco lahko
ustvarimo iz jezikovnega gradiva (corpus) ali besednega seznama
(tokens). Prednostni način je slednji, saj nam omogoča tudi izločanje
nezaželenih izrazov (npr. url, številk idr.).
slcas_toks2 <- tokens(slcas_corp, remove_numbers = T,
remove_punct = FALSE,
remove_symbols = T, remove_url = T)treba <- kwic(slcas_toks2, pattern = "treba")S funkcijo as_tibble() pretvorimo konkordanćni seznam v podatkovni niz
ali tabelo (prim. Excel). To nam omogoča lepši in preglednejši izpis
preglednice in računanje deležev in drugih količin s funkcijami
programskega svežnja tidyverse.
treba_kwic <- as_tibble(treba)
DT::datatable(treba_kwic,
filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))morati <- kwic(slcas_toks2,
pattern = c("morati", "mora", "moram", "moraš",
"morava", "morata", "morajo", "moramo",
"moral", "morala", "morali", "morale",
"moralo"))morati_kwic <- as_tibble(morati)
DT::datatable(morati_kwic,
filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))morati_kwic %>%
count(pattern, sort = T) %>%
mutate(pattern = fct_reorder(pattern, n)) %>%
ggplot(aes(n, pattern, fill = rainbow(12))) +
geom_col() +
theme(legend.position = "none")Next, we calculate the frequency and dispersion of tokens per document, for documents which contain the search terms ‘treba’ and ‘mora*’.
# Per-document frequency of "treba", normalised as percent of document tokens.
# FIX: the original computed hits/(slcas_stat$Tokens[1:length(hits)]/100),
# pairing hit counts with token counts purely by position. Documents without a
# hit are absent after summarise(), so positions drift and token counts can
# belong to the wrong documents. Join the token counts on the document id.
term1 <- kwic(slcas_toks2, "treba", valuetype = "regex", case_insensitive = TRUE) %>%
  group_by(docname) %>%
  summarise(hits = n()) %>%
  left_join(slcas_stat %>% select(docname = Text, Tokens), by = "docname") %>%
  mutate(percentage = hits / (Tokens / 100), searchterm = "treba") %>%
  select(-Tokens) %>%
  arrange(desc(percentage))
# Per-document frequency of the "mora*" pattern, as percent of document tokens.
# NOTE(review): slcas_stat$Tokens[1:length(hits)] pairs hit counts with token
# counts by position only; documents with no hits are absent from the grouped
# result, so the token counts can belong to different documents — verify, and
# prefer a join on the document id (Text/docname).
# NOTE(review): with valuetype = "regex", "mora*" means "mor" plus zero or
# more "a", not the glob-style prefix match — confirm the intended pattern.
# (The display of term1 is fused onto the last line by the chapter export;
# code is left byte-identical.)
term2 <- kwic(slcas_toks2, "mora*", valuetype = "regex", case_insensitive = T) %>%
group_by(docname) %>%
summarise(hits = n()) %>%
mutate(percentage = hits/(slcas_stat$Tokens[1:length(hits)]/100), searchterm = "mora*") %>% arrange(desc(percentage))term1 %>%
arrange(-percentage) %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))term2 %>%
arrange(-percentage) %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.7 Kolokacije
collo2 <- textstat_collocations(slcas_corp)
write_rds(collo2, "data/slcas_corp_collocations2.rds")
write_csv(collo2, "data/slcas_corp_collocations2.csv")collo2 %>%
filter(str_detect(collocation, "\\btreba\\b")) %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("collocation",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))collo2 %>%
filter(str_detect(collocation, "\\bmora")) %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("collocation",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))collo3 <- textstat_collocations(slcas_corp, size = 3, tolower = T)collo3 %>%
filter(str_detect(collocation, "\\btreba\\b")) %>%
arrange(-count) %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("collocation",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))collo3 %>%
filter(str_detect(collocation, "\\bmora")) %>%
arrange(-count) %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("collocation",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))18.8 Konkordance (po povedih)
Ena možnost je tokenizacija po povedih, tako da lahko izvlečemo zgradbo iz posameznih povedi. To ima prednost, da lahko določim dolžino in morebitne druge lastnosti povedi, ki vsebuje naklonsko zgradbo, v primerjavi s povedmi brez naklonske zgradbe.
Nastavitev window = 1 določa, da želimo zajeti eno poved pred iskano
zgradbo in eno po iskani zgradbi.
text <- slovcas$text
toks <- tokens(text, what = 'sentence')“Keyword” je v tem primeru poved z iskano naklonsko zgradbo, “pre” je poved pred njo, “post” pa poved, ki ji sledi.
# Sentence-level concordance: `toks` contains whole sentences as tokens, so a
# regex hit returns the sentence with a mora-form as "keyword", plus one
# sentence of context before and after (window = 1).
# NOTE(review): as a regex, 'mora*' means "mor" followed by zero or more "a",
# so forms such as "morajo" or "moral" match only where the trailing \b
# happens to hold — '\\bmora\\w*' may be the intended pattern; confirm
# against the glob patterns used elsewhere in this chapter.
kwic_mora_poved <- as_tibble(kwic(
# toks, phrase('\\bmora*\\b\\s\\b[a-z]*ti\\b'), # follow each other
toks, phrase('\\bmora*\\b'), # there may be other words in between
valuetype = 'regex', window = 1))
DT::datatable(kwic_mora_poved,
filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))To bi naredili še za druge naklonske zgradbe.
18.9 Konkordance (fraze)
Druga možnost izvleče besedne zveze iz jezikovnega gradiva (corpus). V
tem primeru uporabljamo kwic() in kot vzorec (pattern) izberemo
phrase().
morati2 <- kwic(slcas_corp, valuetype = "glob",
pattern = phrase(c("mora* *ti", "mora* *či")), case_insensitive = T)
morati2_kwic <- as_tibble(morati2)
morati2_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))treba2 <- kwic(slcas_corp, valuetype = "glob",
pattern = phrase(c("treba *ti", "treba *či")), case_insensitive = T)
treba2_kwic <- as_tibble(treba2)
treba2_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))potrebno2 <- kwic(slcas_corp, valuetype = "glob",
pattern = phrase(c("potrebno *ti", "potrebno *či")), case_insensitive = T)
potrebno2_kwic <- as_tibble(potrebno2)
potrebno2_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))18.9.1 časniki
Zgoraj smo že pridobili konkordance iz besednega seznama (tokens), ki
smo organizirali po povedih (sentences) namesto po besedilih
(doc_id). Konkordance so prikazovale tri povedi (keyword, pre- in
post-).
Spodnji način najprej predvideva pretvorbo jezikovnega gradiva
(corpus) v povedi (sentences), in sicer s funkcijo
corpus_reshape(), ki je sestavni del programa quanteda. Na tak način
prikazujemo naklonsko zgradbo kot keyword, funkcija kwic() pa ima
nastavljeno široko okno, tj. window = 50.
slcas_corp_sent <- corpus_reshape(slcas_corp, to = "sentences", use_docvars = T)Poizvedba: med naklonskim izrazom morati oz. treba in nedoločnikom so lahko tudi druge besede. Vsi izrazi so znotraj določene povedi.
morati3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("mora* *ti", "mora* *či")), case_insensitive = T)
treba3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("treba *ti", "treba *či")), case_insensitive = T)Prikaz konkordančne preglednice.
morati3_kwic <- as_tibble(morati3)
morati3_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))treba3_kwic <- as_tibble(treba3)
treba3_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))potrebno3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("potrebno *ti", "potrebno *či")), case_insensitive = T)
potrebno3_kwic <- as_tibble(potrebno3)
potrebno3_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))18.9.2 twitter
Pretvorba jezikovnega gradiva, tako da bo osnovna enota poved
(sentences).
sltwit_corp_sent <- corpus_reshape(sltwit_corp, to = "sentences", use_docvars = T)Poizvedba in prikaz konkordančne preglednice.
morati3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("mora* *ti", "mora* *či")), case_insensitive = T)
morati3tw_kwic <- as_tibble(morati3tw)
morati3tw_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))treba3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("treba *ti", "treba *či")), case_insensitive = T)
treba3tw_kwic <- as_tibble(treba3tw)
treba3tw_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))potrebno3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("potrebno *ti", "potrebno *či")), case_insensitive = T)
potrebno3tw_kwic <- as_tibble(potrebno3tw)
potrebno3tw_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))imeti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("ima* za *ti", "ima* za *či",
"imel* za *ti", "imel* za *či")), case_insensitive = T)
imeti3tw_kwic <- as_tibble(imeti3tw)
imeti3tw_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))imeti3twr <- kwic(sltwit_corp_sent, valuetype = "regex", window = 50,
pattern = phrase(c("\\bima*\\b \\bza\\b t$",
"\\bima*\\b \\bza\\b ti$")), case_insensitive = T)
imeti3twr_kwic <- as_tibble(imeti3twr)
imeti3twr_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 10,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))rabiti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("rabi* za *ti", "rabi* za *či")), case_insensitive = T)
rabiti3tw_kwic <- as_tibble(rabiti3tw)
rabiti3tw_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 3,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))rabiti3twr <- kwic(sltwit_corp_sent, valuetype = "regex", window = 50,
pattern = phrase(c("\\brab*\\b \\bza\\b t$",
"\\brab*\\b \\bza\\b ti$")), case_insensitive = T)
rabiti3twr_kwic <- as_tibble(rabiti3twr)
rabiti3twr_kwic %>%
DT::datatable(filter = "top", fillContainer = TRUE,
extensions = 'Buttons', "ColReorder",
options = list(pageLength = 3,
colReorder = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
)) %>%
formatStyle("keyword",
target = 'row',
backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))18.10 Deleži glagolov
V nadaljevanju uporabljamo konkordance iz prejšnjega odseka, ki
vsebujejo zveze naklonskega glagola in nedoločnika kot fraze, tj. phrase().
Izračunali bomo deleže nedoločnikov, ki spremljajo naklonski glagol
morati in treba (in nekatere druge).
18.10.1 twitter
# Share of each infinitive that follows the modal "morati" in the Twitter
# sample: split the two-token keyword into modal + infinitive, count the
# infinitives, express counts as percentages, and attach the syllable count
# of each infinitive plus sample labels.
m1tw <- morati3tw_kwic %>%
  separate(keyword, into = c("modal", "inf"), sep = " ") %>%
  count(inf, sort = TRUE) %>%
  mutate(
    procent = round(100 * n / sum(n), 2),
    syl_inf = nsyllable::nsyllable(inf),
    modal   = "morati",
    sample  = "twitter"
  )
m1tw %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))morati3tw_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
filter(n >5) %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none")t1tw <- treba3tw_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
mutate(modal = "treba", sample = "twitter")
t1tw %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))treba3tw_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
filter(n > 2) %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none")p1tw <- potrebno3tw_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
mutate(modal = "potrebno", sample = "twitter")
p1tw %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))potrebno3tw_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
filter(n > 2) %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none")18.10.2 časniki
m1 <- morati3_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
mutate(modal = "morati", sample = "casniki")
m1 %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))morati3_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
filter(n >15) %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none")t1 <- treba3_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
mutate(modal = "treba", sample = "casniki")
t1 %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))treba3_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
filter(n >5) %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none")p1 <- potrebno3_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
mutate(modal = "potrebno", sample = "časniki")
p1 %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))potrebno3_kwic %>%
separate(keyword, into = c("modal", "inf"), sep = " ") %>%
count(inf, sort = T) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
filter(n >5) %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none")18.10.3 vsa besedila
modalinf0 <- bind_rows(m1,t1,m1tw,t1tw) %>%
select(sample, modal, inf, n, procent, syl_inf) %>%
mutate(modal = tolower(modal), inf = tolower(inf))
# Display the combined modal+infinitive table.
# FIX: the original call was DT::datatable(slcas_stat, fillContainer = ...)
# after the pipe, which passed `slcas_stat` as the second positional argument
# (`options`) of datatable() — a copy-paste slip; the stray argument is
# removed to match every other datatable call in this chapter.
modalinf0 %>%
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
# Aggregate across both samples (twitter + casniki): total frequency and
# within-modal percentage of each infinitive, plus its syllable count.
modalinf0_all <- modalinf0 %>%
  group_by(modal, inf) %>%
  arrange(inf) %>%
  summarise(freq = sum(n)) %>%
  mutate(pct = round(100 * freq / sum(freq), 2)) %>%
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
  arrange(-pct)
modalinf0_all %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))# write_rds(modalinf0, "data/modalinf0.rds")
# write_csv(modalinf0, "data/modalinf0.csv")
# write_rds(modalinf0_all, "data/modalinf0_all.rds")
# write_csv(modalinf0_all, "data/modalinf0_all.csv")modalinf <- rbind(m1,t1,p1,m1tw,t1tw,p1tw) %>%
select(sample, modal, inf, n, procent, syl_inf) %>%
mutate(modal = tolower(modal), inf = tolower(inf))
modalinf %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))modalinf_all <- modalinf %>%
group_by(modal, inf) %>%
arrange(inf) %>%
summarise(freq = sum(n)) %>%
mutate(pct = round(100*freq / sum(freq), 2)) %>%
mutate(syl_inf = nsyllable::nsyllable(inf)) %>%
arrange(-pct)
modalinf_all %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))# write_rds(modalinf, "data/modalinf.rds")
# write_csv(modalinf, "data/modalinf.csv")
# write_rds(modalinf_all, "data/modalinf_all.rds")
# write_csv(modalinf_all, "data/modalinf_all.csv")18.10.3.1 glmer
library(lme4)
m000 <- glmer(n ~ 1 + (1|sample), poisson, data = modalinf0)
m001 <- glmer(n ~ 1 + modal*syl_inf +
(1|sample), poisson, data = modalinf0)
anova(m000, m001)## Data: modalinf0
## Models:
## m000: n ~ 1 + (1 | sample)
## m001: n ~ 1 + modal * syl_inf + (1 | sample)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m000 2 12451 12461 -6223.6 12447
## m001 5 10621 10646 -5305.5 10611 1836.1 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(effects)
plot(allEffects(m001), multiline = T)png("pictures/modalinf0_glmer.png")
plot(allEffects(m001), multiline = T)
dev.off()## svg
## 2
18.10.3.2 časniki
modalinf %>%
filter(sample == "časniki") %>%
top_n(30, procent) %>%
ungroup() %>%
mutate(inf = reorder_within(inf, procent, paste0("(", procent, ")"), sep = " ")) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none") +
facet_wrap(~ modal, scales = "free") # both scales are free (instead of free_x, free_y)18.10.3.3 twitter
modalinf %>%
filter(sample == "twitter") %>%
top_n(30, procent) %>%
ungroup() %>%
mutate(inf = reorder_within(inf, procent, paste0("(", procent, ")"), sep = " ")) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none") +
facet_wrap(~ modal, scales = "free") # both scales are free (instead of free_x, free_y)18.10.3.4 vsa
modalinf_all %>%
top_n(30, pct) %>%
ungroup() %>%
mutate(inf = reorder_within(inf, pct, paste0("(", pct, ")"), sep = " ")) %>%
ggplot(aes(pct, inf, fill = inf)) +
geom_col() +
scale_y_reordered() +
theme(legend.position = "none") +
facet_wrap(~ modal, scales = "free") +
labs(x = "procent", y = "")18.10.3.5 brez biti
modalinf %>%
filter(procent > 1) %>%
filter(inf != "biti") %>%
mutate(inf = fct_reorder(inf, n)) %>%
ggplot(aes(procent, inf, fill = inf)) +
geom_col() +
theme(legend.position = "none") +
facet_wrap(~ modal, scales = "free_x")modalinf_all %>%
filter(inf != "biti") %>%
top_n(30, pct) %>%
ungroup() %>%
mutate(inf = reorder_within(inf, pct, paste0("(", pct, ")"), sep = " ")) %>%
ggplot(aes(pct, inf, fill = inf)) +
geom_col() +
scale_y_reordered() +
theme(legend.position = "none") +
facet_wrap(~ modal, scales = "free") +
labs(x = "procent", y = "")18.10.4 Večkratno pojavljanje
Glagoli se pojavljajo enkrat ali večkrat:
# 2x2 contingency table for the chi-squared test below: verbs occurring more
# than once ("veckrat", first row) vs exactly once ("enkrat", second row),
# out of 470 "morati" and 214 "treba" occurrences.
x <- tibble(
  morati = c(212, 470 - 212),
  treba  = c(82, 214 - 82)
)
chisq.test(x)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 2.4951, df = 1, p-value = 0.1142
18.10.5 imeti + nedoločnik
imeti3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("ima* za *ti", "ima* za *či",
"imel* za *ti", "imel* za *či")), case_insensitive = T)
imeti3_kwic <- as_tibble(imeti3)
imeti3_kwic %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))imeti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("ima* za *ti", "ima* za *či",
"imel* za *ti", "imel* za *či",
"ima* *ti", "ima* *či",
"imel* *ti", "imel* *či")), case_insensitive = T)
imeti3tw_kwic <- as_tibble(imeti3tw)
imeti3tw_kwic %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.10.6 rabiti + nedoločnik
rabiti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
pattern = phrase(c("rabi* za *ti", "rabi* za *či",
"rabi* *ti", "rabi* *či")), case_insensitive = T)
rabiti3tw_kwic <- as_tibble(rabiti3tw)
rabiti3tw_kwic %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.11 Udpipe analiza
18.11.1 Load model
library(udpipe)
# Path of the pre-downloaded Slovenian UD model (ssj treebank, UD 2.5).
destfile = "slovenian-ssj-ud-2.5-191206.udpipe"
# Download the model on first use; otherwise load the locally cached file.
# NOTE(review): the downloaded file (language_model$file_model) may not be
# named exactly `destfile`, in which case the download would repeat on every
# run — verify the cached name matches.
if(!file.exists(destfile)){
language_model <- udpipe_download_model(language = "slovenian")
ud_sl <- udpipe_load_model(language_model$file_model)
} else {
file_model = destfile
ud_sl <- udpipe_load_model(file_model)
}18.11.2 Annotate
library(udpipe)
slovcas_udpiped <- udpipe_annotate(ud_sl, slovcas$text, trace = FALSE)
slovcas_udpiped <- as.data.frame(slovcas_udpiped)18.11.2.1 Save
# write_rds(slovcas_udpiped, "data/slovcas_udpiped.rds")
# write_csv(slovcas_udpiped, "data/slovcas_udpiped.csv")18.11.3 Clean tweets
# Clean politician tweets before udpipe annotation: strip entity remnants,
# retweet attributions, @-mentions, digits, URLs and hashtags, then
# normalise whitespace. (Assignments changed from `=` to `<-`.)
clean_tweet <- gsub("&", " ", slovtwit$text)
# Drop "RT"/"via" attributions together with the @handles they cite.
clean_tweet <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", clean_tweet)
clean_tweet <- gsub("@\\w+", " ", clean_tweet)
# clean_tweet = gsub("[[:punct:]]", " ", clean_tweet)
clean_tweet <- gsub("[[:digit:]]", " ", clean_tweet)
# Get rid of URLs
clean_tweet <- gsub(
"\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)", "", clean_tweet)
# clean_tweet = gsub("https\\w+", " ", clean_tweet)
# clean_tweet = gsub("http\\w+", " ", clean_tweet)
# Collapse runs of spaces/tabs left behind by the removals above.
clean_tweet <- gsub("[ \t]{2,}", " ", clean_tweet)
# FIX: trim leading/trailing whitespace — the original replaced it with a
# single space, leaving tweets padded instead of trimmed.
clean_tweet <- gsub("^\\s+|\\s+$", "", clean_tweet)
# Get rid of URLs
# clean_tweet <- str_replace_all(
# clean_tweet,"http://t.co/[a-z,A-Z,0-9]*{8}","")
# Take out retweet header, there is only one
clean_tweet <- str_replace(clean_tweet,"RT @[a-z,A-Z]*: "," ")
# Get rid of hashtags
clean_tweet <- str_replace_all(clean_tweet,"#[a-z,A-Z]*"," ")
# Get rid of references to other screennames
clean_tweet <- str_replace_all(clean_tweet,"@[a-z,A-Z]*"," ")
# FIX: collapse any remaining whitespace runs — the original replaced a
# single space with a single space, which was a no-op.
clean_tweet <- str_replace_all(clean_tweet, "\\s+", " ")
slovtwit$text_cleaned <- clean_tweetThe above expression explained: ? optional space (f|ht) match “f” or “ht” tp match “tp” (s?) optionally match “s” if it’s there (://) match “://” (.) match every character (everything) up to [.|/] a period or a forward-slash (.) then everything after that
# regex "((http|ftp|https):\/\/[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)"18.11.4 Annotate tweets
slovtwit_udpiped <- udpipe_annotate(ud_sl,
slovtwit$text_cleaned,
trace = F)
slovtwit_udpiped <- as.data.frame(slovtwit_udpiped)18.11.4.1 Save tweets
# write_rds(slovtwit_udpiped, "data/slovtwit_udpiped.rds")
# write_csv(slovtwit_udpiped, "data/slovtwit_udpiped.csv")18.11.5 tidytext
# tokenize_annotate():
# Turn a udpipe annotation data frame into a tidy one-token-per-row table
# and attach sentence-level complexity measures.
# - unnest_tokens(): lower-cased copy of each token in `word` (original kept,
#   drop = F)
# - cbind_morphological(): expands the UD `feats` string into morph_* columns
# - sentlen / syllables / types are computed on the sentence with punctuation
#   stripped; wordlen = syllables per token, ttr = type-token ratio
# NOTE(review): nsyllable() is primarily tuned for English; syllable counts
# on Slovene text are approximate — confirm this is acceptable.
tokenize_annotate = function(tbl){
tbl %>%
unnest_tokens(word, token, drop = F) %>%
cbind_morphological(term = "feats",
which = c("PronType","NumType","Poss","Reflex",
"Foreign","Abbr","Typo",
"Gender","Animacy","NounClass",
"Case","Number","Definite","Degree",
"VerbForm","Person","Tense","Mood",
"Aspect","Voice","Evident",
"Polarity","Polite","Clusivity")) %>%
# strip punctuation before computing the sentence statistics
mutate(txt = str_replace_all(sentence, "[:punct:]", "")) %>%
mutate(sentlen = quanteda::ntoken(txt)) %>%
mutate(syllables = nsyllable::nsyllable(txt)) %>%
mutate(types = quanteda::ntype(txt)) %>%
mutate(wordlen = syllables/sentlen) %>%
mutate(ttr = types/sentlen) %>%
select(-txt, -feats)
}18.11.5.1 Odpri udpiped
# slovtwit_udpiped <- read_rds("data/slovtwit_udpiped.rds")
# slovcas_udpiped <- read_rds("data/slovcas_udpiped.rds")slovtwit_df <- slovtwit_udpiped %>%
tokenize_annotate() %>% mutate(language = "slv",
texttype = "twitter")slovcas_df <- slovcas_udpiped %>%
tokenize_annotate() %>% mutate(language = "slv",
texttype = "news")slovtxts <- bind_rows(slovtwit_df, slovcas_df)18.11.5.2 Shrani ud
# write_rds(slovtxts, "data/slovtxts.rds")
# write_csv(slovtxts, "data/slovtxts.csv")18.11.5.3 Odpri ud
# slovtxts <- read_rds("data/slovtxts.rds")18.11.5.4 Sample ud
Majhen vzorec za oceno, kolikokrat je uporabljen morati v primeru zunanje nujnosti (ali drugih pomenov).
twitter100 <- slovtxts %>%
filter(texttype == "twitter") %>%
filter(lemma == "morati" | lemma == "treba") %>%
slice_sample(n = 100, replace = TRUE)
# writexl::write_xlsx(twitter100, "data/twitter100.xlsx")18.11.6 Dependenca
# Distribution of dependency relations (dep_rel) per text type: raw counts
# and percentages, widened into separate news/twitter columns.
# NOTE(review): replace_na(.x, 0) on the character dep_rel column and the
# later str_replace("0", "Unknown") suggest NA relations are first coded as
# 0 and then relabelled — confirm this still works with the tidyr version
# in use (newer tidyr is stricter about replacement types).
slo_deprel <- slovtxts %>%
group_by(texttype) %>%
# filter(texttype == "news" | texttype == "twitter") %>%
count(dep_rel, sort = TRUE) %>%
mutate(pct = round(100*n/sum(n),2)) %>%
pivot_wider(names_from = texttype, values_from = c(n, pct)) %>%
mutate(across(everything(), ~ replace_na(.x, 0))) %>%
mutate(dep_rel =
str_replace(dep_rel, "0", "Unknown"))
# Interactive table of the relation frequencies.
slo_deprel %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.11.7 Vizualizacija
library(igraph)
library(ggraph)
library(ggplot2)
plot_annotation <- function(x, size = 3){
  # plot_annotation():
  # Draw the dependency tree of the FIRST sentence in a udpipe annotation
  # data frame `x` as a linear arc diagram: tokens as labels, UPOS tags
  # below them, dependency relations on the arcs. `size` sets label size.
  # Scalar checks: use short-circuit && (not elementwise &); the original
  # required-column vector also listed "token_id" twice.
  stopifnot(is.data.frame(x) &&
              all(c("sentence_id", "token_id", "head_token_id", "dep_rel",
                    "token", "lemma", "upos", "xpos", "feats") %in% colnames(x)))
  # drop tokens without a head, keep only the first sentence
  x <- x[!is.na(x$head_token_id), ]
  x <- x[x$sentence_id %in% min(x$sentence_id), ]
  # edges run dependent -> head; the root (head 0) gets no edge
  edges <- x[x$head_token_id != 0, c("token_id", "head_token_id", "dep_rel")]
  edges$label <- edges$dep_rel
  g <- graph_from_data_frame(edges,
    vertices = x[, c("token_id", "token", "lemma", "upos", "xpos", "feats")],
    directed = TRUE)
  # NOTE(review): windowsFonts() is Windows-only and mutates global device
  # state — confirm this is intended and guarded on other platforms.
  windowsFonts("Arial Narrow" = windowsFont("Arial"))
  ggraph(g, layout = "linear") +
    geom_edge_arc(ggplot2::aes(label = dep_rel, vjust = -0.20),
      arrow = grid::arrow(length = unit(4, 'mm'), ends = "last", type = "closed"),
      end_cap = ggraph::label_rect("wordswordswords"),
      label_colour = "red", check_overlap = TRUE, label_size = size) +
    geom_node_label(ggplot2::aes(label = token), col = "darkgreen", size = size, fontface = "bold") +
    geom_node_text(ggplot2::aes(label = upos), nudge_y = -0.35, size = size) +
    theme_graph(base_family = "Arial Narrow") +
    labs(title = "udpipe output", subtitle = "tokenisation, parts of speech tagging & dependency relations")
}# Slovenian
# Annotate a sample "morati" + infinitive sentence and plot its tree.
mytext <- enc2utf8("Ali jo bodo morali zapustiti")
x <- udpipe(mytext, "slovenian")
x3 <- plot_annotation(x, size = 3)
x3
# Save the plot to a PNG file. The source contained the invalid fused call
# `x3png(...)`; the device must be opened with png() before printing.
png("pictures/morati_zapustiti.png")
x3
dev.off()## svg
## 2
# Slovenian
# Annotate a sample "treba" + infinitive sentence and plot its tree.
mytext <- enc2utf8("Ali jo bo treba zapustiti")
x <- udpipe(mytext, "slovenian")
x4 <- plot_annotation(x, size = 3)
x4
# Save the plot to a PNG file. The source contained the invalid fused call
# `x4png(...)`; the device must be opened with png() before printing.
png("pictures/treba_zapustiti.png")
x4
dev.off()## svg
## 2
18.11.8 Predmet pred/za modal
predmet je NOUN ali PRON
x = slovtxts %>%
group_by(lemma) %>%
filter(lemma == "morati" | lemma == "treba" | lemma == "potreben")
x %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))
# pivot_by_verb_obj():
# Classify each object token as VO (object follows its head verb) or OV
# (object precedes it) and cross-tabulate counts/percentages by text type.
# The input must already be grouped by texttype so count() keeps that column
# for pivot_wider().
pivot_by_verb_obj = function(tbl){
tbl %>%
  # udpipe token ids are character; compare numerically so that e.g.
  # "10" > "9" is ordered correctly (string comparison gets this wrong)
  mutate(word_order = ifelse(as.numeric(token_id) > as.numeric(head_token_id),
                             "VO",
                             "OV")) %>%
  count(dep_rel, word_order) %>%
  mutate(pct = round(100*n/sum(n),2)) %>%
  pivot_wider(names_from = texttype, values_from = c(n, pct)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  # dep_rel is only needed for counting; the original relabelled it
  # ("0" -> "Unknown") and then immediately dropped it — dead code removed
  select(-dep_rel)
}
vo_nominal_morati = slovtxts %>%
group_by(texttype) %>%
# filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
filter(str_detect(sentence, "\\bmora*")) %>%
filter(dep_rel == "obj" &
upos %in% c("NOUN", "PROPN")) %>%
pivot_by_verb_obj() %>%
mutate(word_class = "NOUN") %>%
mutate(modal = "morati")
vo_nominal_morati %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))vo_nominal_treba = slovtxts %>%
group_by(texttype) %>%
# filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
filter(str_detect(sentence, "\\btreba\\b")) %>%
filter(dep_rel == "obj" &
upos %in% c("NOUN", "PROPN")) %>%
pivot_by_verb_obj() %>%
mutate(word_class = "NOUN") %>%
mutate(modal = "treba")
vo_nominal_treba %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))vo_pronominal_morati = slovtxts %>%
group_by(texttype) %>%
# filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
filter(str_detect(sentence, "\\bmora*")) %>%
filter(dep_rel == "obj" &
upos %in% c("PRON")) %>%
pivot_by_verb_obj() %>%
mutate(word_class = "PRON") %>%
mutate(modal = "morati")
vo_pronominal_morati %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))vo_pronominal_treba = slovtxts %>%
group_by(texttype) %>%
# filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
filter(str_detect(sentence, "\\btreba\\b")) %>%
filter(dep_rel == "obj" &
upos %in% c("PRON")) %>%
pivot_by_verb_obj() %>%
mutate(word_class = "PRON") %>%
mutate(modal = "treba")
vo_pronominal_treba %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))verb_object = bind_rows(vo_nominal_morati,
vo_nominal_treba,
vo_pronominal_morati,
vo_pronominal_treba)
verb_object %>%
# select(-pct_news,-pct_twitter) %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))treba, noun: več OV kot VO v twitterju v primerjavi s časniki.
verb_object[3:4,2:3]## # A tibble: 2 x 2
## n_news n_twitter
## <dbl> <dbl>
## 1 238 132
## 2 631 268
verb_object[3:4,2:3] %>%
chisq.test()##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 3.9095, df = 1, p-value = 0.04801
treba, pron: več VO kot OV v twitterju v primerjavi s časniki.
verb_object[7:8,2:3]## # A tibble: 2 x 2
## n_news n_twitter
## <dbl> <dbl>
## 1 261 84
## 2 54 43
verb_object[7:8,2:3] %>%
chisq.test()##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 13.803, df = 1, p-value = 0.000203
# library(esquisse)
# esquisser(verb_object)
ggplot(verb_object) +
aes(x = word_order, y = pct_twitter, fill = modal) +
geom_boxplot(shape = "circle") +
scale_fill_hue(direction = 1) +
theme_minimal()ggplot(verb_object) +
aes(x = word_order, y = pct_news, fill = modal) +
geom_boxplot(shape = "circle") +
scale_fill_hue(direction = 1) +
theme_minimal()ggplot(verb_object) +
aes(x = word_class, y = pct_news, fill = word_order) +
geom_col() +
scale_fill_hue(direction = 1) +
theme_minimal() +
facet_wrap(vars(modal))ggplot(verb_object) +
aes(x = word_class, y = pct_twitter, fill = word_order) +
geom_col() +
scale_fill_hue(direction = 1) +
theme_minimal() +
facet_wrap(vars(modal))18.12 Kwic udpipe
Cilj: analiza nedoločnikov z udpipe in funkcijo
tokenize_annotate(), ki sem jo sestavil zgoraj.
Izbral sem tabelo, ki vsebuje le naklonske zgradbe morati /treba + nedoločnik.
kwic_ud <- modalinf0 %>%
pull(inf) %>%
udpipe_annotate(ud_sl, ., trace = F)
kwic_ud <- as.data.frame(kwic_ud)
kwic_ud <- kwic_ud %>%
tokenize_annotate()
kwic_ud_select <- kwic_ud %>%
select(token, lemma, upos, xpos, morph_aspect, syllables)18.12.1 Združi
modalinf0_ud <- modalinf0 %>%
bind_cols(kwic_ud_select) %>%
select(-token, -lemma, -syllables)18.12.2 Pregled napak
Kateri nedoločniki sploh niso nedoločniki in obratno?
modalinf0_ud %>%
filter(upos != "VERB" & upos != "AUX") %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.12.2.1 Shrani
write_lines(modalinf0_ud %>% pull(inf), "data/modalinf0_ud_inf.txt")18.12.3 Obeliks xml
Zgoraj sem uporabil udpipe za določevanje oblikoslovnih lastnosti
nedoločnikov.
Primerjam še z analizo programa OBELIKS, ki je analiziral nedoločnike
iz datoteke data/modalinf0_ud_inf.txt. Obeliksova izhodna datoteka ima
format xml. Datoteko moram pretvoriti. Pri tem mi pomaga
library(xml2).
Poglej tudi tabele na spletni strani IJS za pretvorbo specifikacij
josMSD: https://nl.ijs.si/jos/josMSD-en.html.
Parsing of xml documents with library(xml2):
gastonsanchez
library(rvest)
library(xml2)
obeliks <- read_xml("data/modalinf0_ud_inf.xml")
obeliks## {xml_document}
## <TEI xmlns="http://www.tei-c.org/ns/1.0">
## [1] <text>\n <body>\n <p>\n <s>\n <w msd="Gp-n" lemma="biti"> ...
We use the xml_length() to know how many elements or nodes are in the
root node:
xml_length(obeliks)## [1] 1
Only one node in this xml document.
I need the information msd from child5.
root = xml_root(obeliks)
child1 = xml_children(root)
child2 = xml_children(child1)
child3 = xml_children(child2)
child4 = xml_children(child3)
child5 = xml_children(child4)
# Access the children nodes of (root) node 1
xml_child(obeliks, search = 1)## {xml_node}
## <text>
## [1] <body>\n <p>\n <s>\n <w msd="Gp-n" lemma="biti">biti</w>\n </ ...
# The function xml_attrs() gives you the attributes of a node. In this case, the node child5 has the attributes msd and lemma.
xml_attrs(child5)[1:3]## [[1]]
## msd lemma
## "Gp-n" "biti"
##
## [[2]]
## msd lemma
## "Ggnn" "imeti"
##
## [[3]]
## msd lemma
## "Ggdn" "narediti"
# Number of children nodes inside a given node. In this case it is 0.
xml_length(child5)[1:3]## [1] 0 0 0
# move along the children nodes
xml_name(xml_children(child5)) # no name## character(0)
xml_name(xml_children(child4))[1:3] # name = "w"## [1] "w" "w" "w"
xml_name(xml_children(child3))[1:3] # name = "s"## [1] "s" "s" "s"
xml_name(xml_children(child2))[1:3] # name = "p"## [1] "p" "p" "p"
xml_name(xml_children(child1)) # name = "body"## [1] "body"
xml_name(xml_children(root)) # name = "text"## [1] "text"
title1 <- xml_child(child4, "w") # in this case: NA
cont <- xml_contents(title1) # in this case: 0
titletxt <- xml_text(title1) # in this case : NA18.12.3.1 Save child5 to csv file:
child5char <- as.character(child5)
write_lines(child5char, "data/obeliks_child5char.txt")18.12.3.2 Open child5char and parse into data frame.
obeliks_delim <- read_delim("data/obeliks_child5char.txt",
delim = " ",
col_names = F) %>% clean_names() %>%
select(-x1) %>%
separate(x2, into = c("x2", "xpos2"),
sep = "=", extra = "merge") %>%
separate(x3, into = c("x3", "inf2"),
sep = "=", extra = "merge") %>%
separate(inf2, into = c("inf2", "x4"),
sep = ">", extra = "merge") %>%
separate(x4, into = c("inf0", "x4"),
sep = "<", extra = "merge") %>%
mutate(xpos2 = str_remove_all(xpos2, '"'),
inf2 = str_remove_all(inf2, '"')) %>%
select(xpos2, inf0)18.12.3.3 Save data frame.
write_csv(obeliks_delim, "data/obeliks_delim.csv")18.12.4 Združi kwic in obeliks
Združim oba podatkovna niza in v nadaljevanju popravljam napake, ki jih
je naredil predvsem udpipe, v manjši meri obeliks. Slednji je
trainiran na slovenska besedila.
# modalinf0_udobx
modalinf0_udoblx <- modalinf0_ud %>%
bind_cols(obeliks_delim) %>%
mutate(upos2 = case_when(
str_detect(xpos2, "^Gg") ~ "VERB",
str_detect(xpos2, "^Gp") ~ "AUX",
TRUE ~ "UNK"
)) %>%
mutate(morph_aspect2 = case_when(
xpos2 == "Ggnn" ~ "Imp",
xpos2 == "Ggdn" ~ "Perf",
xpos2 == "Ggvn" ~ "Both",
xpos2 == "Gp-n" ~ "Imp",
TRUE ~ "UNK")) %>%
mutate(inf = str_replace(inf, "poceti", "početi"),
inf = str_replace(inf, "bti", "biti"),
inf = str_replace(inf, "povećavati", "povečavati"),
inf = str_replace(inf, "prepricati", "prepričati"),
inf = str_replace(inf, "splacati", "splačati"),
inf = str_replace(inf, "spostovati", "spoštovati")) %>%
mutate(upos2 = case_when(
upos == "VERB" &
upos2 == "UNK" ~ "VERB",
TRUE ~ upos2
)) %>%
mutate(upos2 = case_when(
inf == "morebiti" ~ "UNK",
inf == "biti" ~ "AUX",
inf == "temeljiti" ~ "VERB",
inf == "vesti" ~ "VERB",
inf == "pasti" ~ "VERB",
inf == "maksimizirati" ~ "VERB",
inf == "početi" ~ "VERB",
inf == "povečavati" ~ "VERB",
inf == "prepričati" ~ "VERB",
inf == "splačati" ~ "VERB",
inf == "spoštovati" ~ "VERB",
inf == "sesuti" ~ "VERB",
TRUE ~ upos2
)) %>%
mutate(morph_aspect2 = case_when(
upos2 == "VERB" &
upos == "VERB" &
morph_aspect2 == "UNK" ~ morph_aspect,
TRUE ~ morph_aspect2
)) %>%
mutate(xpos2 = case_when(
inf == "biti" ~ "Gp-n",
inf == "sprejeti" ~ "Ggdn",
inf == "reči" ~ "Ggdn",
inf == "vesti" ~ "Ggnn",
inf == "izraziti" ~ "Ggdn",
inf == "zavzeti" ~ "Ggdn",
inf == "pasti" ~ "Ggdn",
inf == "temeljiti" ~ "Ggnn",
inf == "maksimizirati" ~ "Ggvn",
inf == "početi" ~ "Ggvn",
inf == "prepričati" ~ "Ggdn",
inf == "povečavati" ~ "Ggnn",
inf == "razviti" ~ "Ggdn",
inf == "splačati" ~ "Ggvn",
inf == "sesuti" ~ "Ggdn",
inf == "spoštovati" ~ "Ggnn",
inf == "zaužiti" ~ "Ggdn",
TRUE ~ xpos2
)) %>%
mutate(morph_aspect2 = case_when(
xpos2 == "Ggnn" ~ "Imp",
xpos2 == "Ggdn" ~ "Perf",
xpos2 == "Ggvn" ~ "Both",
xpos2 == "Gp-n" ~ "Imp",
TRUE ~ "UNK")) %>%
select(-upos, -xpos, -inf0, -morph_aspect) %>%
rename(upos = upos2, xpos = xpos2,
morph_aspect = morph_aspect2)18.12.4.1 Shrani združeno
write_rds(modalinf0_udoblx,
"data/casniki_twitter_modalinf0_udoblx.rds")
write_csv(modalinf0_udoblx,
"data/casniki_twitter_modalinf0_udoblx.csv")18.12.5 Pretvorba josMSD sl>en
Najprej odpremo in prilagodimo tabelo josMSD, ki sem jo dobil na spletni
strani IJS, josMSD: https://nl.ijs.si/jos/josMSD-en.html.
josmsd = read.delim2(
"data/josMSD.tbl",
# stolpci so ločeni tabulatorsko
sep = "\t",
# univerzalno kodiranje črk
encoding = "UTF-8",
# dodamo imena stolpcev (lemma, word)
col.names = c("koda", "msdslo", "msdeng"),
# preberi kot črkovne nize
stringsAsFactors = F) %>%
select(-koda)Drugi korak: pretvori podatkovni niz v ločena znakovna niza.
# v tem stolpcu je osnovna oblika besede
lemma = as.character(josmsd$msdeng)
# v tem stolpcu je ena izmed besednih oblik
word = as.character(josmsd$msdslo)S funkcijo tokens_replace() programa quanteda pretvorimo slovensko
specifikacijo v angleško.
modalinf0_x <-
# zamenjava seznama (tokens_replace), s pomočjo pull()
tokens_replace(
modalinf0_udoblx %>% pull(xpos) %>% tokens(),
# obliko, ki jo želimo zamenjati
pattern = word,
# zamenjava
replacement = lemma,
# pazi na začetnico
case_insensitive = FALSE,
# natančno ujemanje oblik
valuetype = "fixed")
# zdaj imamo angleško specifikacijo (če je program našel zamenjavo)
modalinf0_x %>% head(3)## Tokens consisting of 3 documents.
## text1 :
## [1] "Va-n"
##
## text2 :
## [1] "Vmpn"
##
## text3 :
## [1] "Vmen"
modalinf0_udoblx$xpos2 <- as.character(modalinf0_x)modalinf0_udoblx %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.12.5.1 shrani xpos eng
write_rds(modalinf0_udoblx,
"data/casniki_twitter_modalinf0_udoblx.rds")
write_csv(modalinf0_udoblx,
"data/casniki_twitter_modalinf0_udoblx.csv")18.12.5.2 Odpri
modalinf0_udoblx <-
read_rds("data/casniki_twitter_modalinf0_udoblx.rds")18.12.6 Oblikoslovje nedoločnikov
aspekt <- modalinf0_udoblx %>%
filter(morph_aspect != "UNK") %>%
group_by(modal) %>%
count(morph_aspect) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
pivot_wider(names_from = "modal",
values_from = c("n", "procent"))
aspekt %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))Brez biti:
aspekt_brez_biti <- modalinf0_udoblx %>%
filter(morph_aspect != "UNK") %>%
filter(inf != "biti") %>%
group_by(modal) %>%
count(morph_aspect) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
pivot_wider(names_from = "modal",
values_from = c("n", "procent"))
aspekt_brez_biti %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))chisq.test(aspekt[,2:3])##
## Pearson's Chi-squared test
##
## data: aspekt[, 2:3]
## X-squared = 0.27055, df = 2, p-value = 0.8735
chisq.test(aspekt_brez_biti[,2:3])##
## Pearson's Chi-squared test
##
## data: aspekt_brez_biti[, 2:3]
## X-squared = 0.17605, df = 2, p-value = 0.9157
Namesto pojavnic različnice (types).
aspect_types <- modalinf0_udoblx %>%
distinct(modal, inf, .keep_all = T) %>%
filter(morph_aspect != "UNK") %>%
# filter(inf != "biti") %>%
group_by(modal) %>%
count(morph_aspect) %>%
mutate(procent = round(100*n / sum(n), 2)) %>%
pivot_wider(names_from = "modal",
values_from = c("n", "procent"))
aspect_types %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))chisq.test(aspect_types[,2:3])##
## Pearson's Chi-squared test
##
## data: aspect_types[, 2:3]
## X-squared = 0.16298, df = 2, p-value = 0.9217
18.13 Slovnična prožnost
Barkema (1996): grammatische Flexibilität Barkema je želel pokazati, da je mnogo ustaljenih izrazov (feste Fügungen) mnogo prožnejših, kot se na splošno trdi.
Stefanowitsch (2006): obratno, ta metoda je smiselna tudi za dokazovanje, da so marsikatere jezikovne zgradbe, ki se ravnajo po splošnem pravilu, manj prožne, kot se na splošno pričakuje. To formalno utemeljitev želi uveljaviti za ugotavljanje neke zgradbe kot konstrukcije v smislu konstrukcijske gramatike.
Najprej ustvarimo ustrezno konkordanco za vsa besedila v časnikih in čivkih (tweetih).
18.13.1 konkordanca dveh zgradb
morati4_kwic <- morati3_kwic %>% mutate(sample = "casniki")
treba4_kwic <- treba3_kwic %>% mutate(sample = "casniki")
morati4tw_kwic <- morati3tw_kwic %>% mutate(sample = "twitter")
treba4tw_kwic <- treba3tw_kwic %>% mutate(sample = "twitter")
konkordance_all <- bind_rows(morati4_kwic, treba4_kwic,
morati4tw_kwic, treba4tw_kwic) %>%
mutate(keyword = str_to_lower(keyword)) %>%
separate(keyword, into = c("keyword", "inf"),
sep = " ", extra = "merge")Naslednja preglednica prikazuje oblikoslovni profil glagolskih oblik glagola morati.
konkordance_all %>%
count(keyword, sort = TRUE) %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))konkordance_all %>%
count(keyword, inf, sort = TRUE) %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))18.13.2 Oblike glagolov (types)
Zgoraj smo izvedli oblikoslovno označevanje (udpipe).
Tu preizkušamo različnice (types) glagola morati in drugih glagolov.
# Morphological profile of "morati": token forms with their UD features,
# most frequent first. (The filter lemma != "biti" is redundant here; it is
# kept for symmetry with glagoli_oblike below.)
morati_oblike <- slovtxts %>%
  filter(upos == "VERB" | upos == "AUX") %>%
  filter(lemma != "biti") %>%
  filter(lemma == "morati") %>%
  group_by(lemma, token) %>%
  count(xpos, morph_number, morph_person, morph_tense,
        morph_mood, morph_aspect) %>%
  # desc() takes a single vector: sort by n descending, then by xpos
  # (the original desc(n, xpos) passes an unused second argument)
  arrange(desc(n), xpos)
# Same profile for all other full verbs (without "biti" and "morati").
glagoli_oblike <- slovtxts %>%
  filter(upos == "VERB" | upos == "AUX") %>%
  filter(lemma != "biti") %>%
  filter(lemma != "morati") %>%
  group_by(lemma, token) %>%
  count(xpos, morph_number, morph_person, morph_tense,
        morph_mood, morph_aspect) %>%
  arrange(desc(n), xpos)18.13.2.1 morph_num
Glagol morati se pogosteje pojavlja v bolj zaznamovanih številih dvojina in množina kot preostali glagoli v vzorcu.
morati_num <- morati_oblike %>% ungroup() %>%
count(morph_number, sort = F, name = "n_morati") %>%
drop_na() %>%
mutate(pct_morati = round(100*n_morati / sum(n_morati), 2))
glagoli_num <- glagoli_oblike %>% ungroup() %>%
count(morph_number, sort = F, name = "n_glagoli") %>%
drop_na() %>%
mutate(pct_glagoli = round(100*n_glagoli / sum(n_glagoli), 2))
glagoli_morph_num <- bind_cols(morati_num, glagoli_num[,2:3])glagoli_morph_num[, c(2,4)] %>% chisq.test()##
## Pearson's Chi-squared test
##
## data: .
## X-squared = 8.6688, df = 2, p-value = 0.01311
18.13.2.2 morph_pers
morati_pers <- morati_oblike %>% ungroup() %>%
count(morph_person, sort = F, name = "n_morati") %>%
drop_na() %>%
mutate(pct_morati = round(100*n_morati / sum(n_morati), 2))
glagoli_pers <- glagoli_oblike %>% ungroup() %>%
count(morph_person, sort = F, name = "n_glagoli") %>%
drop_na() %>%
mutate(pct_glagoli = round(100*n_glagoli / sum(n_glagoli), 2))
glagoli_morph_pers <- bind_cols(morati_pers, glagoli_pers[,2:3])glagoli_morph_pers[, c(2,4)] %>% chisq.test()##
## Pearson's Chi-squared test
##
## data: .
## X-squared = 3.5748, df = 2, p-value = 0.1674
18.13.2.3 morph_tense
morati_tense <- morati_oblike %>% ungroup() %>%
mutate(morph_tense = ifelse(is.na(morph_tense),
"other", morph_tense)) %>%
count(morph_tense, sort = F, name = "n_morati") %>%
# drop_na() %>%
mutate(pct_morati = round(100*n_morati / sum(n_morati), 2))
glagoli_tense <- glagoli_oblike %>% ungroup() %>%
mutate(morph_tense = ifelse(is.na(morph_tense),
"other", morph_tense)) %>%
mutate(morph_tense = ifelse(morph_tense == "Fut", "other",
morph_tense)) %>%
count(morph_tense, sort = F, name = "n_glagoli") %>%
# drop_na() %>%
mutate(pct_glagoli = round(100*n_glagoli / sum(n_glagoli), 2))
glagoli_morph_tense <- bind_cols(morati_tense, glagoli_tense[,2:3])glagoli_morph_tense[, c(2,4)] %>% chisq.test()##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 2.3495, df = 1, p-value = 0.1253
18.13.2.4 morph_mood
morati_mood <- morati_oblike %>% ungroup() %>%
mutate(morph_mood = ifelse(is.na(morph_mood),
"other", morph_mood)) %>%
count(morph_mood, sort = F, name = "n_morati") %>%
# drop_na() %>%
mutate(pct_morati = round(100*n_morati / sum(n_morati), 2))
glagoli_mood <- glagoli_oblike %>% ungroup() %>%
mutate(morph_mood = ifelse(is.na(morph_mood),
"other", morph_mood)) %>%
mutate(morph_mood = case_when(
morph_mood == "Imp" ~ "other",
morph_mood == "Cnd" ~ "other",
TRUE ~ morph_mood)) %>%
count(morph_mood, sort = F, name = "n_glagoli") %>%
# drop_na() %>%
mutate(pct_glagoli = round(100*n_glagoli / sum(n_glagoli), 2))
glagoli_morph_mood <- bind_cols(morati_mood, glagoli_mood[,2:3])glagoli_morph_mood[, c(2,4)] %>% chisq.test()##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 2.2621, df = 1, p-value = 0.1326
18.13.2.5 morph_aspect
morati_aspect <- morati_oblike %>% ungroup() %>%
mutate(morph_aspect = case_when(
morph_aspect == "Perf" ~ "other",
is.na(morph_aspect) ~ "other",
TRUE ~ morph_aspect)) %>%
count(morph_aspect, sort = F, name = "n_morati") %>%
# drop_na() %>%
mutate(pct_morati = round(100*n_morati / sum(n_morati), 2))
glagoli_aspect <- glagoli_oblike %>% ungroup() %>%
mutate(morph_aspect = case_when(
morph_aspect == "Perf" ~ "other",
is.na(morph_aspect) ~ "other",
TRUE ~ morph_aspect)) %>%
count(morph_aspect, sort = F, name = "n_glagoli") %>%
# drop_na() %>%
mutate(pct_glagoli = round(100*n_glagoli / sum(n_glagoli), 2))
glagoli_morph_aspect <- bind_cols(morati_aspect, glagoli_aspect[,2:3])glagoli_morph_aspect[, c(2,4)] %>% chisq.test()##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 30.129, df = 1, p-value = 4.043e-08
18.13.3 Oblike glagolov (tokens)
# slovtxts <- read_rds("data/slovtxts.rds")morati_count <- slovtxts %>%
mutate(lemma = str_to_lower(lemma),
token = str_to_lower(token)) %>%
filter(upos == "VERB" | upos == "AUX") %>%
filter(lemma == "morati") %>%
count(token, sort = T, name = "freq")
library(udpipe)
morati_ud <- morati_count %>%
pull(token) %>%
udpipe_annotate(ud_sl, ., trace = F) %>%
as.data.frame() %>%
cbind_morphological()
morati_types_token <- morati_ud %>%
left_join(morati_count, by = "token")
morati_xpos_freq <- morati_types_token %>%
select(token, lemma, upos, xpos, freq) %>%
filter(token != "morš" & token != "morijo") %>%
filter(upos == "VERB")glagoli_count <- slovtxts %>%
mutate(lemma = str_to_lower(lemma),
token = str_to_lower(token)) %>%
filter(upos == "VERB" | upos == "AUX") %>%
filter(lemma != "morati") %>%
count(token, sort = T, name = "freq")
glagoli_ud <- glagoli_count %>%
pull(token) %>%
udpipe_annotate(ud_sl, ., trace = F) %>%
as.data.frame() %>%
cbind_morphological()
glagoli_types_token <- glagoli_ud %>%
left_join(glagoli_count, by = "token") %>%
filter(lemma != "biti")glagoli_types_token %>%
filter(upos == "VERB") %>%
slice_sample(n = 100) %>%
select(token, lemma, upos, xpos, freq) %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))oblike_glagolov <- "Vmpr3s$|Vmpp-pm$|Vmpr1p$|Vmpr3p$|Vmpp-sm$|Vmpp-sf$|Vmpr1s$|Vmpr2p$|Vmpp-sn$|Vmpr2s$|Vmpp-pf$|Vmpr3d$|Vmpr2d$"
glagoli_xpos_freq <- glagoli_types_token %>%
filter(upos == "VERB") %>%
select(xpos, freq) %>%
group_by(xpos) %>%
summarise(freq_sum = sum(freq)) %>%
arrange(desc(freq_sum)) %>%
filter(str_detect(xpos, oblike_glagolov))Ali se pogostnost oblik glagola morati razlikuje od pogostnosti oblik preostalih (polnopomenskih) glagolov?
Izvedemo lahko binomialni preizkus: binom.test(51, 235, 1/6,
alternative = “two.sided”)
V nadaljevanju sem uporabil hi kvadrat preizkus chisq.test().
Oba podatkovna niza združimo in odstranimo nepotrebne stolpce.
# Merge the morati and all-other-verbs frequency tables (merge() joins on
# the shared xpos column) and keep only the columns needed downstream.
xpos_freq <- merge(morati_xpos_freq, glagoli_xpos_freq) %>%
  # tidyselect forbids mixing negative (-upos, -lemma) and positive
  # selection in one call; select the kept columns explicitly instead
  select(token, xpos, freq, freq_sum) %>%
  rename(freq_morati = freq, freq_glagoli = freq_sum) %>%
  arrange(xpos)Hi kvadrat preizkus lahko opravimo za vsako glagolsko različnico posebej, npr. za prvo na seznamu.
x <- as_tibble(xpos_freq[1,3:4])
y <- bind_cols(sum(xpos_freq[2:12,3]), sum(xpos_freq[2:12,4])) %>%
rename(freq_morati = "...1", freq_glagoli = "...2")
z <- bind_rows(x,y)
chisq.test(z)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: z
## X-squared = 18.475, df = 1, p-value = 1.721e-05
Elegantnejša je zanka for() ali funkcija map(). Preglednejša je
for()-zanka.
# Avoid scientific notation in printed p-values. The original line
# `options = (scipen = 999)` merely created a variable named `options`;
# it never changed the R option.
options(scipen = 999)
# Row-wise chi-square tests: each verb form (row) against the sum of all
# other forms. NOTE: the original iterated over 1:length(xpos_freq), i.e.
# over the data frame's COLUMNS, so only the first few rows were tested;
# seq_len(nrow(...)) visits every row.
for(i in seq_len(nrow(xpos_freq))){
  # frequencies of verb types in question
  x <- xpos_freq[i,3:4]
  # sum of verb types frequencies without the above verb types
  y <- bind_cols(
    sum(xpos_freq[-i,3]),
    sum(xpos_freq[-i,4])) %>%
    rename(freq_morati = "...1", freq_glagoli = "...2")
  # create a little 2x2 data frame
  z <- bind_rows(x,y)
  # chi-square test results (run once, reused below)
  chi <- chisq.test(z)
  # expected frequencies
  expected_morati <- chi$expected[1,1] %>% as.numeric()
  expected_glagoli <- chi$expected[1,2] %>% as.numeric()
  # pull the p-value from the already computed test
  # (the original re-ran chisq.test a second time)
  p_value <- chi %>%
    tidy() %>%
    pull(p.value) %>%
    as.numeric()
  # append the p-values row by row to the data frame
  xpos_freq$p_value[i] <- round(p_value, 5)
  # append the expected freqs row by row to the data frame
  xpos_freq$expected_morati[i] <- round(expected_morati, 2)
  xpos_freq$expected_glagoli[i] <- round(expected_glagoli, 2)
}
# Add percentages
xpos_freq <- xpos_freq %>%
mutate(pct_morati =
round(100*freq_morati/sum(freq_morati), 2),
pct_glagoli =
round(100*freq_glagoli/sum(freq_glagoli), 2)) %>%
arrange(desc(pct_morati, pct_glagoli))
# write_rds(xpos_freq, "data/xpos_freq.rds")
# write_csv(xpos_freq, "data/xpos_freq.csv")
xpos_freq %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))Različnice glagola morati imajo statistično različno pogostnost od različnic drugih glagolov v jezikovnem gradivu, z izjemo oblike Vmpp-sm (deležnik na -l moškega spola, npr. moral, gledal, obkljukal, …) in Vmpp-sn (npr. moralo, gledalo, …).
library(tidytext)
library(scales)
xpos_freq %>%
pivot_longer(pct_morati:pct_glagoli,
names_to = "types", values_to = "pct") %>%
group_by(types) %>%
# mutate(xpos = reorder_within(
# xpos, types, paste0(" (",pct,"%)"), sep = "")) %>%
mutate(xpos = fct_reorder(xpos, pct)) %>%
ggplot(aes(pct/100, xpos, fill = types)) +
geom_col() +
theme(legend.position = "none") +
scale_x_continuous(labels = percent) +
facet_wrap(~ types, scales = "free_y") +
labs(x = "", y = "")18.13.4 Značilni nedoločniki
Kateri glagoli se pogosteje povezujejo z morati oz. s treba? Kateri nedoločniki so signifikantni distinktivni kolleksemi enot morati in treba?
# modalinf0_all <- read_rds("data/modalinf0_all.rds")modalinf0_all_by_verbs <- modalinf0_all %>%
pivot_wider(names_from = modal, values_from = c(freq, pct)) %>%
select(inf, freq_morati:pct_treba, syl_inf) %>%
mutate(freq_morati = ifelse(is.na(freq_morati), 0, freq_morati),
freq_treba = ifelse(is.na(freq_treba), 0, freq_treba))options = (scipen = 999)
# Row-wise chi-square tests: each infinitive (row) against the sum of all
# other infinitives. NOTE: 1:length(df) iterates over the data frame's
# COLUMNS; seq_len(nrow(df)) is needed to actually visit every row.
for(i in seq_len(nrow(modalinf0_all_by_verbs))){
  # frequencies of verb types in question
  x <- modalinf0_all_by_verbs[i,2:3]
  # sum of verb types frequencies without the above verb types
  y <- bind_cols(
    sum(modalinf0_all_by_verbs[-i,2]),
    sum(modalinf0_all_by_verbs[-i,3])) %>%
    rename(freq_morati = "...1", freq_treba = "...2")
  # create a little 2x2 data frame
  z <- bind_rows(x,y)
  # chi-square test results (run once, reused below)
  chi <- chisq.test(z)
  # expected frequencies
  expected_morati <- chi$expected[1,1] %>% as.numeric()
  expected_treba <- chi$expected[1,2] %>% as.numeric()
  # pull the p-value from the already computed test
  # (the original re-ran chisq.test a second time)
  p_value <- chi %>%
    tidy() %>%
    pull(p.value) %>%
    as.numeric()
  # append the p-values / expected freqs row by row to the data frame
  modalinf0_all_by_verbs$p_value[i] <- round(p_value, 5)
  modalinf0_all_by_verbs$expected_morati[i] <- round(
    expected_morati, 2)
  modalinf0_all_by_verbs$expected_treba[i] <- round(
    expected_treba, 2)
}
# Add percentages
modalinf0_all_by_verbs <- modalinf0_all_by_verbs %>%
mutate(pct_morati =
round(100*freq_morati/sum(freq_morati), 2),
pct_treba =
round(100*freq_treba/sum(freq_treba), 2)) %>%
arrange(desc(pct_morati, pct_treba))
# write_rds(modalinf0_all_by_verbs, "data/modalinf0_all_by_verbs.rds")
# write_csv(modalinf0_all_by_verbs, "data/modalinf0_all_by_verbs.csv")
modalinf0_all_by_verbs %>%
DT::datatable(fillContainer = TRUE, filter = "top",
options = list(pageLength = 4))Pri naslednjih (maloštevilnih) glagolih ni statistično značilne razlike.
modalinf0_all_by_verbs %>%
filter(p_value > .05)## # A tibble: 0 x 9
## # ... with 9 variables: inf <chr>, freq_morati <dbl>, freq_treba <dbl>,
## # pct_morati <dbl>, pct_treba <dbl>, syl_inf <int>, p_value <dbl>,
## # expected_morati <dbl>, expected_treba <dbl>
18.13.5 Hapax legomena
Produktivnost opredeljena z deležem nedoločnikov, ki se le enkrat pojavljajo z morati oz. treba.
s_morati <- modalinf0_all_by_verbs %>%
filter(freq_morati > 0) %>%
count()
h_morati <- modalinf0_all_by_verbs %>%
filter(freq_morati == 1) %>%
count()
produktiv_morati <- h_morati/s_morati
produktiv_morati## n
## 1 0.5098361
s_treba <- modalinf0_all_by_verbs %>%
filter(freq_treba > 0) %>%
count()
h_treba <- modalinf0_all_by_verbs %>%
filter(freq_treba == 1) %>%
count()
produktiv_treba <- h_treba/s_treba
produktiv_treba## n
## 1 0.5889968
Z ozirom na Baayenovo (Baayen 1991) kvantitativno opredelitev produktivnosti bi bila zgradba treba + inf (0,589) nekoliko produktivnejša kot morati + inf (0,51), razlika pa je statistično značilna (p = 0,028).
h = cbind(h_morati, h_treba)
s = cbind(s_morati, s_treba)
z = rbind(h, s-h)
chisq.test(z)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: z
## X-squared = 4.8551, df = 1, p-value = 0.02756
18.14 Disperzijska analiza
Carrols D2 (Carrol 1970) ima vrednosti med 0 in 1: V koliko različnih registrih, žanrih itd. (v uravnoteženem korpusu moramo imeti za vsak žanr / register eno datoteko).
\[a = \log\Sigma{p_{1-n}}\] \[b = \Sigma{p_{1-n}}\log{p_{1-n}}\] \[c = \Sigma{p_{1-n}}\]
\[d = \log{n}\]
\[ D = \frac{\log\Sigma{p_{1-n}} - (\frac{\Sigma{p_{1-n}}\log{p_{1-n}}}{\Sigma{p_{1-n}}})}{\log{n}} \]
\(p_{1-n}\) je verjetnost pojavljanja nekega obeležja v datotekah 1 do \(n\), in \(n\) je število korpusnih datotek.
Verjetnost pojavljanja je relativna količina, tj. pogostnost z ozirom na velikost datoteke. Za izračun disperzije (razpršenosti) besede, podajamo število vseh besed (konstrukcij), kar predstavlja velikost datoteke.
Ali je treba + nedoločnik manj razpršena kot morati + nedoločnik? Ali je torej za določene besedilne vrste bolj značilna, medtem ko v drugih redko kdaj nastopa?
Nemška HZI-konstrukcija ima manjšo disperzijo kot müssen. HZI je bolje zastopan v pravilnikih, manj pa v pripovednih (narativnih) besedilih.
knitr::include_graphics("pictures/disperzija_hzi.png")
18.15 Semantika: kompozicijonalnost
Stefanowitsch(2006: 21): “Zunächst stellt sich aber aus konstruktionsgrammatischer Perspektive die Frage, ob sich diese spezielleren Beschränkungen kompositionell aus der Bedeutung der einzelnen Komponenten der Struktur ableiten lassen. A priori scheint es unwahrscheinlich, dass dies der Fall ist, da die Komponenten haben, zu und der Infinitiv eine Vielzahl grammatischer Funktionen haben, sodass fraglich ist, ob sie überhaupt invariante Bedeutungen haben. … es gibt hier Erfolg versprechende Ansätze, die sich dieser Fragestellung über den Vergleich der Kollokate einer komplexen Struktur mit den Kollokaten der einzelnen Komponenten nähern. Diese Ansätze erfordern allerdings größere Datenmengen. Stattdessen will ich hier sprachvergleichende Überlegungen heranziehen … Die allgemeine Bedeutung dieser Struktur ist, ebenso wie im Deutschen, VERPFLICHTUNG. … Wenn diese sich kompositionell herleiten lassen, sollten sie sich auch bei der englischen Konstruktion finden. Betrachten wir aber die distinktiven Kollexeme der englischen Konstruktion im Vergleich zum übergeordneten Verb must ‚müssen‘ …, so findet sich keinerlei Evidenz für eine ähnliche speziellere Bedeutung. Wie das deutsche müssen hat auch must trotz des ausgewogeneren Häufigkeitsverhältnisses wenige distinktive Kollexeme, die stark an die von müssen erinnern: es sind be ‚sein‘ (have to 27/must 159, 4.80E–18), have ‚haben‘ (9/49, 1.10E–05), know ‚wissen/kennen‘ (1/13, 5.45E–03) und stand ‚stehen‘ (0/7, 2.23E–02). Für [have + to + Infinitiv] gibt es eine Reihe von distinktiven Kollexemen. Allerdings handelt es sich dabei ausschließlich um sehr gebräuchliche Verben, die keiner klaren semantischen Klasse zugeordnet werden können. Es lässt sich allenfalls festhalten, dass es sich überwiegend um dynamische Verben handelt, während die wenigen distinktiven Kollexeme von must statische Verben sind. 
Ebenso wie im Falle von müssen dürften die epistemischen Verwendungen von must die Ursache für diese Kollexeme sein, denn diese sind für have to zwar möglich, aber selten. Anders als bei der deutschen HZI-Struktur ergeben die distinktiven Kollexeme bei [have + to + Infinitiv] also keine klaren Hinweise auf einen BEHÖRDEN-Frame oder eine andere spezifischere Bedeutung der Struktur.”
knitr::include_graphics("pictures/distinktive_kollexeme.png")
“Dies wird auch durch eine Analyse der Dispersion von [have + to + Infinitiv] im BROWN-Korpus bestätigt, die eindeutig zeigt, dass diese Struktur auch in Bezug auf ihre typischen Gebrauchskontexte nicht eingeschränkt ist: sie hat eine Dispersion von 0,96 und ist damit genauso allgemein verwendbar wie [must + Infinitiv], das ebenfalls eine Dispersion von 0,96 im BROWN-Korpus hat. … Da die deutsche Konstruktion aber die stärker eingeschränkte Bedeutung hat, scheint die Schlussfolgerung plausibel, dass es diese ist, die weniger kompositionell ist.”